# -*- coding:utf8 -*-
# !/usr/bin/env python

"""
#全国企业信用信息公示系统（四川）
#维护黄羽
"""

import re
# import urllib2
from bs4 import BeautifulSoup
from utils import kill_captcha
import traceback
import requests
import copy
import time

from scpy.logger import get_logger
from scpy.xawesome_time import parse_time
import table
import sd_template_dict as TE
# from get_page import *
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

logger = get_logger(__file__)

TIME_OUT = 30

ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36"


def clean_case_info(raw_cf_table):
    """Parse one administrative-penalty ("行政处罚") detail table into a case dict.

    :param raw_cf_table: HTML fragment of the "详情摘要" table; may be ''/None.
    :return: dict with the fixed caseInfo schema when the table has exactly
             8 ``<td>`` cells, otherwise {}.
    """
    if not raw_cf_table:
        return {}
    # Strip all whitespace up front so newlines inside tags cannot break
    # the non-greedy <td> regex below.  Raw strings avoid the invalid
    # escape-sequence pitfall of '\s' in regular string literals.
    raw_cf_table = re.sub(r'\s', '', raw_cf_table)
    td_s = re.findall(r'<td.*?>(.*?)</td>', raw_cf_table, re.S)
    if td_s and len(td_s) == 8:
        # Administrative penalty record; fields the page does not provide
        # stay '' so downstream consumers always see the full schema.
        return {
            'caseTime': '',            # time the case arose
            'caseReason': '',          # cause of action
            'caseType': '',            # case type
            'exeSort': '',            # execution category
            'caseResult': '',          # case result
            'pendecissDate': td_s[7],  # date the penalty decision was signed
            'penAuth': td_s[6],        # penalty authority
            'illegFact': '',           # main illegal facts
            'penBasis': td_s[4],       # legal basis of the penalty
            'penType': '',             # penalty kind
            'penResult': td_s[5],      # penalty result
            'penAm': '',               # penalty amount
            'penExest': '',            # penalty execution status
        }
    return {}


def download_captcha_kill(companyName):
    """
    Download the captcha, crack it, then search for the company.

    :param companyName: company name (utf-8 encoded str)
    :return: if the captcha is cracked and the company exists, the match
             tuple scraped from openView(...) on the result page
             (presumably (pripid, entbigtype, ...) — see get_company_info);
             None if the company does not exist;
             '' (empty string) if the cracked captcha turned out to be wrong,
             so the caller can retry;
             raises if the cracking service or an HTTP request fails.
    """
    # Download the captcha.
    index_url = "http://gsxt.scaic.gov.cn/ztxy.do"
    req = requests.session()
    random_time_index = '%s' % int(time.time() * 1000)
    index_headers = {'User-Agent': ua, 'Referer': 'http://gsxt.scaic.gov.cn/', 'Host': 'gsxt.scaic.gov.cn'}
    req.headers = index_headers
    # Dummy index request — required ("must") so the session picks up the
    # cookies/state the site expects before serving a captcha.
    index_response = req.get(index_url, timeout=TIME_OUT, params={'method': 'index', 'random': '%s' % random_time_index})

    index_headers['Referer'] = "http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=" + random_time_index
    req.headers = index_headers

    img_url = 'http://gsxt.scaic.gov.cn/ztxy.do'
    random_time_img = '%s' % int(time.time() * 1000)
    img_req_query = {
        "method": "createYzm",
        "dt": random_time_img,
        "random": random_time_img,
    }
    img = req.get(img_url, params=img_req_query, timeout=TIME_OUT).content

    try:
        # External captcha-cracking service ('sc' = Sichuan, image is jpg).
        res_code = kill_captcha(img, 'sc', 'jpg')
    except Exception, e:
        logger.error("破解验证码的服务出现异常")
        logger.error(e)
        raise e
    # 'None'/'wrong' (as literal strings) and over-long answers are failure
    # markers from the cracking service.
    if not res_code or len(res_code) > 100 or str(res_code) in ['None', 'wrong']:
        logger.info('验证码为:%s' % res_code)
        logger.error("破解验证码的服务出现异常,可能是下载的验证码错误，也可能破解服务出现异常！")
        return ''  # empty string tells the caller to retry the captcha

    check_url = 'http://gsxt.scaic.gov.cn/keyword.do'
    random_time = '%s' % int(time.time() * 1000)
    check_req_query = {
        "method": "keywordFilter",
        "random": random_time,
    }
    # The site expects GBK: re-encode the company name and captcha answer.
    ccn = companyName.decode(encoding='utf-8').encode(encoding='gbk')
    check_req_data = {"qymc": ccn, }
    check_res = req.post(check_url, params=check_req_query, data=check_req_data, timeout=TIME_OUT).content
    res_code = res_code.decode(encoding='utf-8').encode(encoding='gbk')

    # Placeholder text of the search box, sent back as pName/BA_ZCH form fields.
    form_str = '请输入营业执照注册号或统一社会信用代码'.decode(encoding='utf-8').encode(encoding='gbk')
    search_req_data = {
        'currentPageNo': '1',
        'yzm': res_code,
        'pName': form_str,
        'maent.entname': ccn,
        'BA_ZCH': form_str,
    }
    random_time = '%s' % int(time.time() * 1000)
    search_url = 'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=%s' % random_time
    search_res = req.post(search_url, search_req_data, timeout=TIME_OUT).content
    search_res_soup = BeautifulSoup(search_res, "html5lib")
    search_res_soup_str = str(search_res_soup)

    # Page embeds a JS flag when the captcha answer was rejected.
    if "var flag = 'fail';" in search_res_soup_str:
        logger.info('验证码错误, 验证码为:%s' % res_code)
        return ''

    if '您搜索的条件无查询结果。</div>' in search_res_soup_str:
        logger.info("搜索的公司不存在")
        return None

    # Each hit is rendered as onclick="openView('pripid','entbigtype','...')".
    a_com = re.findall('''onclick="openView\('(.*?)','(.*?)','(.*?)'\)">''', search_res_soup_str)
    if not a_com:
        logger.info("搜索的公司不存在")
        return None
    else:
        logger.info("搜索的公司存在！")
        return a_com[0]


def _post_module_html(req, url, data):
    """POST one ztxy.do module request; return the html5lib-normalised page.

    Best effort: any failure (network, parse) yields '' so a single broken
    module does not abort the whole crawl.
    """
    try:
        raw = req.post(url, data=data, timeout=TIME_OUT).content
        return str(BeautifulSoup(raw, 'html5lib'))
    except Exception:
        return ''


def get_company_info(com_info):
    """
    Download the company's detail pages and annual-report pages.

    :param com_info: match tuple from the search result page; com_info[0] is
        the pripid, com_info[1] the entbigtype.
    :return: raw-page dict {"province", "type", "html", "yearList", ...}
    :raises Exception: if com_info is missing/too short, or if the base page
        request itself fails (without it nothing else can be parsed).
    """
    if not com_info or len(com_info) < 3:
        raise Exception("com_list 错误")
    pripid = com_info[0]
    entbigtype = com_info[1]

    raw_dict = {
        "province": "sc",
        "type": "1",
        "html": "",
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_base_dict = {}

    root_url = 'http://gsxt.scaic.gov.cn/ztxy.do'
    req = requests.session()
    req.headers = {'User-Agent': ua}

    def _rand():
        # The site uses a millisecond timestamp as a cache-buster parameter.
        return '%s' % int(time.time() * 1000)

    # Base info ("基本信息") — errors here propagate to the caller.
    base_req_data = {
        'method': 'qyInfo',
        'djjg': '',
        'maent.pripid': pripid,
        'maent.entbigtype': entbigtype,
        'random': _rand(),
    }
    raw_base_res = req.post(root_url, data=base_req_data, timeout=TIME_OUT).content
    raw_base_res = str(BeautifulSoup(raw_base_res, 'html5lib'))
    raw_base_dict['base'] = raw_base_res

    # Shareholder detail pages: each link is onclick="showRyxx('xh','pripid')".
    share_detail_list = []
    share_detail_s = re.findall(r'''onclick="showRyxx\(\'(.*?)\',\'(.*?)\'\)"''', raw_base_res, re.S)
    for a_share in share_detail_s:
        share_detail_req_data = {
            'method': 'tzrCzxxDetial',
            'maent.xh': a_share[0],
            'maent.pripid': a_share[1],
            'random': _rand(),
        }
        try:
            raw_a_share_detail_res = req.post(root_url, data=share_detail_req_data, timeout=TIME_OUT).content
            raw_a_share_detail_res = str(BeautifulSoup(raw_a_share_detail_res, 'html5lib'))
        except Exception:
            time.sleep(2)  # brief back-off, then skip this shareholder
            continue
        share_detail_list.append(raw_a_share_detail_res)
    raw_base_dict['share_detail'] = share_detail_list

    # Best-effort modules that all share the method/pripid/czmk request shape.
    for key, method, czmk in (
        ('ba', 'baInfo', 'czmk2'),          # filing info (备案信息)
        ('abnormal', 'jyycInfo', 'czmk6'),  # abnormal operations (经营异常)
        ('check', 'ccjcInfo', 'czmk7'),     # spot-check info (抽查检查)
        ('dcdy', 'dcdyInfo', 'czmk4'),      # chattel mortgage (动产抵押)
        ('gqczxx', 'gqczxxInfo', 'czmk4'),  # equity pledge (股权出质)
    ):
        raw_base_dict[key] = _post_module_html(req, root_url, {
            'method': method,
            'maent.pripid': pripid,
            'czmk': czmk,
            'random': _rand(),
        })

    # Administrative penalty list page ("行政处罚"): only used to discover
    # the detail links; the list page itself is not stored (kept as-is for
    # backward compatibility of the output dict).
    raw_cf_res = _post_module_html(req, root_url, {
        'method': 'cfInfo',
        'maent.pripid': pripid,
        'czmk': 'czmk3',
        'random': _rand(),
    })

    # Penalty detail pages: onclick="doXzfyDetail('pripid','xh');"
    cf_detail_s = re.findall(r'''onclick="doXzfyDetail\(\'(.*?)\',\'(.*?)\'\);''', raw_cf_res)
    raw_cf_detail_list = []
    for a_cf_detail in cf_detail_s:
        detail_html = _post_module_html(req, root_url, {
            'method': 'doXzfyDetail',
            'maent.pripid': a_cf_detail[0],
            'maent.xh': a_cf_detail[1],
            'random': _rand(),
        })
        if detail_html:
            raw_cf_detail_list.append(detail_html)
    raw_base_dict['cf_detail'] = raw_cf_detail_list

    raw_dict['html'] = raw_base_dict

    # Annual reports ("年报"): the index page lists the available years.
    year_index_req_data = {
        'method': 'qygsInfo',
        'maent.pripid': pripid,
        'czmk': 'czmk8',
        'random': _rand(),
    }
    try:
        # Intentionally not soup-normalised; only fed to the regex below.
        raw_year_index_res = req.post(root_url, data=year_index_req_data, timeout=TIME_OUT).content
    except Exception:
        raw_year_index_res = ''
    year_s = re.findall(r'''onclick="doNdbg\(\'(.*?)\'\);''', raw_year_index_res, re.S)
    raw_year_report_list = []
    for a_year in year_s:
        logger.info("正在获取 %s" % a_year)
        raw_year_res = _post_module_html(req, root_url, {
            'method': 'ndbgDetail',
            'maent.pripid': pripid,
            'maent.nd': str(a_year),
            'random': _rand(),
        })
        raw_year_report_list.append({
            "year": a_year,
            "year_base": raw_year_res,
        })

    raw_dict['yearList'] = raw_year_report_list

    return raw_dict


def clean_shareHolderList(a_table):
    """Extract the shareholder list from the "股东信息" table on the base page.

    Only shareholderType/shareholderName are filled here; the remaining
    fields are completed later from the shareholder detail pages.

    :param a_table: raw HTML of the shareholder table
    :return: list of shareholder dicts (possibly empty)
    """
    a_table = table.tr(a_table)
    detail = re.findall('<tr>.*?</tr>', a_table)
    # The first two rows are the section title and the column headers.
    # Slicing (instead of the former double `del detail[0]`) cannot raise
    # IndexError on malformed tables with fewer than two rows.
    shareHolderList = []
    for row in detail[2:]:
        row = table.td_clean(row)
        detail_td = re.findall("<td>(.*?)</td>", row)
        try:
            dic1 = {
                "shareholderType": detail_td[3],
                "shareholderName": detail_td[0],
                'country': '',
                'subConam': '',
                'regCapCur': '',
                'conDate': '',
                'fundedRatio': '',
            }
            shareHolderList.append(dic1)
        except IndexError:
            # Row with fewer cells than expected (e.g. a "no data" row) — skip.
            continue
    return shareHolderList


def extract_base_info(raw_dict):
    """Parse the raw pages from get_company_info into the basic-info dict.

    :param raw_dict: dict returned by get_company_info (reads its "html" key)
    :return: dict shaped like TE.void_base_dict
    :raises Exception: when raw_dict, its "html" entry, or the base-info
        table is missing.
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    raw_html = raw_dict.get("html", {})
    if not raw_html:
        raise Exception("raw_dict 错误")

    # Basic info ("基本信息")
    raw_base = raw_html.get("base")
    raw_base = str(BeautifulSoup(raw_base, 'html5lib'))
    raw_base_table = table.table_clean(raw_base, "基本信息")
    if not raw_base_table:
        raise Exception("基本信息错误")
    res_dict = copy.deepcopy(TE.void_base_dict)
    res_dict["basicList"] = table.index("基本信息", raw_base_table)
    res_dict["province"] = "sc"

    # Shareholder info ("股东信息"); some company types label the section
    # "股东（发起人）信息" instead.
    share_table = table.table_clean(raw_base, "股东信息") or table.table_clean(raw_base, "股东（发起人）信息")
    share_holder_list_1 = clean_shareHolderList(share_table) if share_table else []

    # TODO: only shareholderType is merged below; country/regCapCur/fundedRatio
    # stay empty even when the detail page has them.
    # Shareholder detail pages: merge richer data into share_holder_list_1.
    raw_share_detail = raw_html.get("share_detail", [])
    for a_share_detail in raw_share_detail:
        a_share_detail = str(BeautifulSoup(a_share_detail, 'html5lib')) if a_share_detail else ''
        a_share_detail_table = re.findall("(<table.*?</table>)", a_share_detail, re.S)
        if a_share_detail_table:
            trs = re.findall("(<tr.*?</tr>)", a_share_detail_table[0], re.S)
            if trs and len(trs) > 3:
                # Row 3 of the detail table carries the data cells; exactly
                # 9 <td>s are expected for this layout.
                tds = re.findall("<td.*?>(.*?)</td>", trs[3], re.S)
                if tds and len(tds) == 9:
                    shareHolder_dict = {
                        'shareholderName': tds[0],
                        'shareholderType': '',  # shareholder type (filled from list page below)
                        'country': '',  # country
                        'subConam': table.money_notclean(tds[1]) if tds[1] else '',  # subscribed capital (unit: 10k CNY)
                        'regCapCur': '',  # currency
                        'conDate': parse_time(tds[5]) or parse_time(tds[8]),  # contribution date (either column may hold it)
                        'fundedRatio': '',  # contribution ratio
                    }
                    # Merge detail-page and list-page shareholder info,
                    # matched by (non-empty) shareholder name.
                    for iii, a_detail_1 in enumerate(share_holder_list_1):
                        if a_detail_1.get("shareholderName", "") == tds[0] and tds[0]:
                            shareHolder_dict["shareholderType"] = a_detail_1.get("shareholderType", "")
                            share_holder_list_1[iii] = copy.deepcopy(shareHolder_dict)
    res_dict["shareHolderList"] = share_holder_list_1

    # Change records ("变更信息"): strip residual tags and the "collapse" label.
    alter_table = table.table_clean(raw_base, "变更信息") if raw_base else ''
    alter_list = table.index("变更信息", alter_table) if alter_table else []
    for a_alter in alter_list:
        for k, v in a_alter.items():
            a_alter[k] = re.sub('<.*?>', '', v).replace("收起更多", "")
    res_dict['alterList'] = alter_list

    raw_ba = raw_html.get("ba", "")
    raw_ba = str(BeautifulSoup(raw_ba, 'html5lib')) if raw_ba else ''
    # Key personnel ("主要人员信息").
    # NOTE(review): the fallback reads "家庭成员信息" from raw_base (not raw_ba)
    # — presumably for individual businesses; confirm against real pages.
    person_table = table.table_clean(raw_ba, "主要人员信息") or table.table_clean(raw_base, "家庭成员信息") if raw_ba else ''
    res_dict['personList'] = table.index("主要人员信息", person_table) if person_table else []

    # Branches ("分支机构信息")
    branch_table = table.table_clean(raw_ba, "分支机构信息") if raw_ba else ''
    res_dict['filiationList'] = table.index("分支机构信息", branch_table) if branch_table else []

    # Liquidation ("清算信息")
    liquidation_table = table.table_clean(raw_ba, "清算信息") if raw_ba else ''
    res_dict['liquidationList'] = table.index("清算信息", liquidation_table) if liquidation_table else []

    # Abnormal operations ("经营异常信息")
    raw_abnormal = raw_html.get("abnormal", "")
    raw_abnormal = str(BeautifulSoup(raw_abnormal, 'html5lib')) if raw_abnormal else ''
    abnormal_table = table.table_clean(raw_abnormal, "经营异常信息") if raw_abnormal else ''
    res_dict['abnormalOperation'] = table.index("经营异常信息", abnormal_table) if abnormal_table else []

    # Spot checks ("抽查检查信息")
    raw_check = raw_html.get("check", "")
    raw_check = str(BeautifulSoup(raw_check, 'html5lib')) if raw_check else ''
    check_table = table.table_clean(raw_check, "抽查检查信息") if raw_check else ''
    res_dict['checkMessage'] = table.index("抽查检查信息", check_table) if check_table else []

    """
    # 行政处罚
    """
    # Administrative penalties: one "详情摘要" page per case.
    raw_cf = raw_html.get("cf_detail", []) or []
    case_info_list = []
    for a_cf in raw_cf:
        a_cf_soup = BeautifulSoup(a_cf, 'html5lib') if a_cf else ''
        a_cf = str(a_cf_soup) if a_cf_soup else ''
        cf_table = table.table_clean(a_cf, "详情摘要") if a_cf else ''
        a_cf_dict = clean_case_info(cf_table) if cf_table else {}
        if a_cf_dict:
            # Execution status lives in a separate <div class="gscf1"> block.
            a_tag = a_cf_soup.find('div', attrs={'class': 'gscf1'})
            if a_tag:
                a_cf_dict['penExest'] = str(a_tag.text)
            case_info_list.append(a_cf_dict)

    res_dict['caseInfoList'] = case_info_list

    return res_dict


def extract_year_info(raw_dict):
    """Extract the annual-report sections from the raw page dict.

    :param raw_dict: dict returned by get_company_info (reads "yearList")
    :return: list of year dicts shaped like TE.void_year_dict
    :raises Exception: when raw_dict is empty/None.
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    # (titles to try for table_clean, report_index key, result key, empty value)
    sections = [
        (("企业基本信息", "基本信息"), "企业基本信息", "baseInfo", {}),
        (("网站或网店信息",), "网站或网店信息", "website", {}),
        (("股东（发起人）及出资信息",), "股东及出资信息", "investorInformations", []),
        (("对外投资信息",), "对外投资信息", "entinvItemList", []),
        (("企业资产状况信息",), "企业资产状况信息", "assetsInfo", {}),
        (("股权变更信息",), "股权变更信息", "equityChangeInformations", []),
    ]

    res_year_list = []
    for year_item in raw_dict.get("yearList", []):
        year_dict = copy.deepcopy(TE.void_year_dict)
        year_dict["year"] = year_item.get("year", "")
        page = year_item.get("year_base", "")
        page = str(BeautifulSoup(page, 'html5lib')) if page else ''

        for titles, index_key, res_key, empty in sections:
            tbl = ''
            for title in titles:
                # short-circuit: try the next title only if the previous failed
                tbl = tbl or table.table_clean(page, title)
            year_dict[res_key] = table.report_index(index_key, tbl) if tbl else empty

        # "修改记录" needs extra cleanup: strip residual tags from every cell.
        rec_table = table.table_clean(page, "修改记录")
        records = table.report_index("修改记录", rec_table) if rec_table else []
        for rec in records:
            for k, v in rec.items():
                rec[k] = re.sub('<.*?>', '', v)
        year_dict["changeRecords"] = records

        res_year_list.append(year_dict)

    return res_year_list


def search2(companyName, MAXTIME=40):
    """
    Full search flow: crack the captcha (with retries), download all pages,
    and parse them.

    :param companyName: utf-8 encoded company name
    :param MAXTIME: maximum number of captcha-cracking attempts
    :return: None when the company does not exist; otherwise the tuple
             (raw_dict, asic_dict_or_None, gate_method) — asic_dict is None
             when parsing failed but the raw pages were downloaded.
    :raises Exception: after MAXTIME failed captcha attempts, or when the
        cracking service / HTTP layer fails.
    """
    res = ''
    a_time = MAXTIME
    while a_time > 0:
        if res is None:  # company does not exist
            return None
        elif res == '':  # captcha was wrong -> retry
            if a_time < MAXTIME:
                logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
            a_time -= 1
            try:
                res = download_captcha_kill(companyName)
            except Exception as e:
                # Fix: print_exc takes (limit, file); passing the exception
                # object as `limit` was a misuse that suppressed the trace.
                traceback.print_exc()
                raise e
        else:
            break
    if a_time <= 1 and res == '':
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)

    com_list = res
    res = get_company_info(com_list)
    raw_dict = res
    # Built once; only companyName is refined when parsing succeeds.
    gate_method = {
        'url': 'http://gsxt.scaic.gov.cn/',
        'method': 'post',
        'province': 'sc',
        'companyName': companyName,
        'data': com_list,
    }
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        company_name = asic_dict['basicList'][0].get('enterpriseName', '') or companyName
        res['companyName'] = company_name
        asic_dict['yearReportList'] = year_list
        gate_method['companyName'] = company_name
        return res, asic_dict, gate_method
    except Exception as e:
        # Parsing failed: still return the raw pages so nothing is lost.
        logger.info(e)
        res['companyName'] = companyName
        return res, None, gate_method


def search(companyName):
    """Convenience wrapper: run the full search and return only the parsed dict.

    :param companyName: utf-8 encoded company name
    :return: the asic_dict from search2, or None when the company is not found.
    """
    result = search2(companyName)
    return result[1] if result else None


def search3(gate_method):
    """Re-crawl a company from a previously saved gate_method (no captcha).

    :param gate_method: dict whose 'data' key holds the com_list tuple, or —
        for compatibility with the older Sichuan crawler — a URL string.
    :return: (raw_dict, asic_dict_or_None, fresh_gate_method)
    :raises Exception: when 'data' is missing or the URL string is malformed.
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    com_list = gate_method.get('data')

    # Compatibility with the older Sichuan crawler, which stored a URL string.
    if isinstance(com_list, basestring):
        matches = re.findall("'(.*?)','(.*?)','(.*?)'", com_list)
        if not matches:
            raise Exception("url error")
        com_list = matches[0]

    res = get_company_info(com_list)
    companyName = gate_method.get('companyName', '')
    raw_dict = res

    new_gate = {
        'url': 'http://gsxt.scaic.gov.cn/',
        'method': 'post',
        'province': 'sc',
        'companyName': companyName,
        'data': com_list,
    }
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        found_name = asic_dict['basicList'][0].get('enterpriseName', '')
        if not found_name:
            found_name = companyName
        res['companyName'] = found_name
        asic_dict['yearReportList'] = year_list
        new_gate['companyName'] = found_name
        return res, asic_dict, new_gate
    except Exception as e:
        # Parsing failed: return raw pages with the caller-supplied name.
        logger.info(e)
        res['companyName'] = companyName
        return res, None, new_gate


if __name__ == "__main__":
    # Manual smoke test: crawl one known company and dump the full result.
    # companyName = '荣县平民药房'
    # companyName = '成都速飞软件有限公司'
    # companyName = '四川成焊宝玛焊接装备工程有限公司'
    # companyName = '成都市高新区金坤小额贷款有限公司'
    companyName = '成都薪筹宝电子商务有限公司'

    result = search2(companyName)

    import json

    print json.dumps(result, indent=4, ensure_ascii=False)
